Raw data available at GSEXXXXXX
Abstract
Abstract goes here
# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE,
warning = FALSE,
error=FALSE,
echo=TRUE,
cache=FALSE,
fig.width = 7, fig.height = 6,
fig.align = 'left')
# Reading in silico PCR results
f_dir <- paste0(wd, "All_chrom_in_silico_PCR_files_GRCh38/")
f_list <- list.files(f_dir)
in_silico_list <- lapply(1:21, function(x) read.csv(paste0(f_dir, f_list[x]), header = TRUE))
df_in_silico <- rbindlist(in_silico_list)
#############
df_in_silico$X <- NULL
# Renaming columns to indicate their coordinate genome version
colnames(df_in_silico)[10] <- "Chr_GRCh38_pred"
colnames(df_in_silico)[11] <- "Amplicon_Start_GRCh38"
colnames(df_in_silico)[12] <- "Amplicon_End_GRCh38"
colnames(df_in_silico)[13] <- "Amplicon_length_GRCh38"
colnames(df_in_silico)[15] <- "Sequence_GRCh38"
#head(df_in_silico[,1:14])
#dim(df_in_silico) # 2086 PCR products from 1964 primer pairs.
# Flag amplicons with multiple PCR products
# In silico PCR was run with maxProductSize = 1500 bp.
# Most byproducts have much shorter lengths, making them likely to be packaged into viruses, which has a capacity of 900 bp.
# In silico prediction of longer products that may not be packaged into AAV due to length constraint may cause reduced efficiency of PCR amplification and decreased abundance of these amplicons.
# Conclusion: In-silico PCR confirmed high specificity of primer design, and viral packaging.
# There are 39 amplicons with more than 1 predicted in silico PCR product.
df <- as.data.frame(table(df_in_silico$UNIQID))
unspecific_PCR <- dplyr::filter(df, Freq > 1)
# Unspecific amplicons and their product sizes
#as.data.frame(dplyr::filter(df_in_silico, UNIQID %in% unspecific_PCR$Var1)[,c("UNIQID", "Amplicon_length_GRCh38", "PCR_efficiency")])
# A histogram of isPCR product lengths.
p1 <- ggplot(df_in_silico, aes(x = Amplicon_length_GRCh38))+
geom_density()+
geom_vline(xintercept = median(na.omit(df_in_silico$Amplicon_length_GRCh38)), color = "red", linetype = 2)+
annotate("text", label = "median = 906 bp", x = 950, y = 0.05, size = 6, colour = "red")+
theme_bw()+
labs(title = "A histogram of all isPCR product lengths", x = "Amplicon lengths [bp]")+
theme(plot.title = element_text(hjust = 0.5))+
coord_cartesian(xlim = c(800, 1000))
# A histogram of isPCR product lengths predicted as unspecific
p2 <- ggplot(dplyr::filter(df_in_silico, UNIQID %in% unspecific_PCR$Var1),
aes(x = Amplicon_length_GRCh38))+
geom_density()+
theme_bw()+
labs(title = "A histogram of unspecific isPCR product lengths", x = "Amplicon lengths [bp]")+
theme(plot.title = element_text(hjust = 0.5))
plot_grid(p1, p2,
labels = c('A', 'B'),
rel_widths = c(1.2, 1.2),
vjust = 1.5)
Histograms of isPCR products. (A) All isPCR products, n = 2086; (B) Unspecific isPCR products, n = 128; Unique primer pairs, n = 1997
# Conclusions:
# 1. The majority of unspecific products have lengths compatible with AAV packaging
# 2. There are two PCR products of 1450 bp, and one 78 bp
# Defines Perfect_In_silico_specificity
df_in_silico$Perfect_In_silico_specificity <- ifelse(df_in_silico$UNIQID %in% as.character(unspecific_PCR$Var1), FALSE, TRUE)
# GC content calculation
GC_cont <- function(x){
x <- toupper(x)
num_g <- str_count(x, "G")
num_c <- str_count(x, "C")
gc_content <- (num_g + num_c) / str_length(x)
gc_content
}
df_in_silico$GC_content <- GC_cont(df_in_silico$Sequence)
# I'm flagging amplicons if they are intended products if they chromosomes match, and PCR product length is +/- 5 bp from the product predicted on hg19 genome.
# Fixing X chrom name and chrom class encoding to permit logical operations
# The line below is an ugly hack.
#df_in_silico$CHROMOSOME <- ifelse(is.na(as.numeric(df_in_silico$CHROMOSOME)), 23, as.numeric(df_in_silico$chr))
#df_in_silico$Chr_GRCh38_pred <- as.numeric(df_in_silico$Chr_GRCh38_pred)
#df_in_silico$chr <- as.numeric(df_in_silico$chr)
# Removes 4 records lacking predicted PCR products.
d <- na.omit(df_in_silico)
#unique(setdiff(df_in_silico$UNIQID, d$UNIQID))
#No PCR prod for:
# "1956_2_40385062_C_G_11869.p1",
# "408_16_78441213_C_G_12937.p1",
# "2712_3_68059361_A_C_13159.p1",
# "940_17_35098691_G_A_14586.p1"
# This is different from STAR408, because we don't have product lengths in primer spreadsheet
# Unlike the STAR408 project, in silico PCR is the only source of amplicon sequence length.
# Start and End coordinates denounce SNVs, not amplicons.
# Amplicons were designed to be ~900 bases.
# I called amplicons "Intended_interval" when they fall between 875 : 925.
d$Intended_interval <- sapply(1:nrow(d), function(x) d$Amplicon_length_GRCh38[x]
%in% c(875 : 925))
# Extracting extra amplicons without predicted in silico PCR products
no_PCR_amp <- dplyr::filter(df_in_silico, UNIQID %in% c("1956_2_40385062_C_G_11869.p1", "408_16_78441213_C_G_12937.p1", "2712_3_68059361_A_C_13159.p1", "940_17_35098691_G_A_14586.p1"))
no_PCR_amp$Intended_interval <- NA
# Adding amplicons lacking pred. PCR prod. to the full dataset
d2 <- rbind(d, no_PCR_amp)
# Checks if the chrom number is as expected
d2$Intended_chr <- d2$CHROMOSOME == d2$Chr_GRCh38_pred
# Intended_amplicon has expected chr and intended interval
# NOTE: consider renaming INtended_interval to Expected_isPCR_length
d2$Intended_amplicon <- d2$Intended_chr & d2$Intended_interval
#2086 - total number of amplicons
#nrow(d2)
# 2040 - amplicons meeting Intended_amplicon criteria
#sum(na.omit(d2$Intended_amplicon))
# 1997 - number of unique IDs
# length(unique(d2$UNIQID))
# Conclusion: some amplicons have multiple isPCR products and match the Intended_amplicon criteria
#Manual inspection
# 42 spurious amplicons
#dim(dplyr::filter(d2, Intended_amplicon == FALSE))
# 42 PCR byproducts
PCR_byproducts <- dplyr::filter(d2, Intended_amplicon == FALSE)
#unique(PCR_byproducts$UNIQID) # 18 UNIQID in PCR_byproducts
# For activity modeling I'll focus only on these with Perfect_In_silico_specificity
isPCR <- dplyr::filter(d2, Perfect_In_silico_specificity == TRUE)
# There are 1958 of these amplicons. This is a good set I'll be using for lm modeling
#nrow(isPCR)
#Let's load the V4 and V5 data. This code has been adapted from Linda Su-Feher analysis in 20200116_Combined_Analysis.R
## load v4 + v5 data
## dims: 2576 x 19-22
data.v4 <- read.table("Amplicon_Counts/ASD1KV4_Combined_Amplicons_DupRem.txt", sep = "\t", header = T, quote = "")
data.v5 <- read.table("Amplicon_Counts/ASD1KV5_Combined_Amplicons_DupRem.txt", sep = "\t", header = T, quote = "")
# The two bed files contain dictionaries between GRCh37 and GRCh38 SNV coordinates. I think this was done using Bioconductor liftOver
# Combined_408_ASD30-1000_GRCh38_FORALLELES.bed contains the exact amplicon coordinates for STAR408, ASD30 and ASD100 libraries; and the 1700 bp guesstimates for the ASD1K library, extended 850 bp each direction from SNV coordinates.
# Combined_408_ASD30-1000_GRCh38_unflanked.bed contains SNV coordinates for ASD1K
# This is not necessary because GRCh37 coordinates were later estimated from isPCR (in_silico_expected_chr_GRCh37)
d <- read.table("Coordinates/Combined_408_ASD30-1000_GRCh38_FORALLELES.bed")
e <- read.table("Coordinates/Combined_408_ASD30-1000_GRCh38_unflanked.bed")
# Examples of amplicon ID formats:
# STAR408: "1_CACNA1C or 1UN_CACNA1C", "41_Epilepsy", UN stands for unique or non-overlapping
# ASD70: "27_12303.p1_Amp", all have _Amp
# ASD30: "1-13111.p1" number - patient number p1 or s1
# ASD1k: "1340_3_56717417_A_T_13043.s1"
# A function annotating amplicons by library type
lib_annotation <- function(x){
x$library <- ifelse(
grepl("^\\d+(UN)?_[A-Za-z0-9]+$", x$V4, perl=TRUE), "STAR408", "")
x$library <- ifelse(
grepl("_Control_", x$V4, perl=TRUE), "STAR408", x$library)
x$library <- ifelse(
grepl("^.*_Amp$", x$V4, perl = TRUE), "ASD70", x$library)
x$library <- ifelse(
grepl("^\\d+-\\d+.[sp]1$", x$V4, perl = TRUE), "ASD30", x$library)
x$library <- ifelse(
grepl("^\\d+_\\d+_\\d+_([ATCG]*)_([ATCG]*)_\\d+.(\\w)\\d$", x$V4, perl = TRUE), "ASD1k", x$library)
colnames(x) <- c("Chr", "START_GRCh38", "END_GRCh38", "UNIQID", "Library")
x
}
#dim(d)
#dim(e) # e has 20 more rows
d <- lib_annotation(d)
e <- lib_annotation(e)
# These are the two missing ASD1k amplicons in d:
# "931_1_144520557_G_A_14547.p1"
# "2573_9_69219445_T_C_11218.p1"
#setdiff(e$UNIQID, d$UNIQID)
#setdiff(d$UNIQID, e$UNIQID)
#length(e$UNIQID) #2596
#length(d$UNIQID) #2576
#length(unique(e$UNIQID)) # 2578 There are some duplicated UNIQIDs in e
#length(unique(d$UNIQID)) # 2576
#nrow(filter(e, Library == "ASD1k")) # 1996
#nrow(filter(d, Library == "ASD1k")) # 1994
# Reading in silico PCR GRCh37 from pre-defined chromosomes data - DEPRICATED, delete
# Pre defined chromosomes refers to running isPCR only against a the expected chromosome.
#in_silico_expected_chr_GRCh37 <- read.csv("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/In_silico_PCR/Pre_defined_chrom_in_silico_PCR_GRCh37/Amplicons_w_edge_coordinates_and_seq_pre_def_chrom_GRCh37_1_to_1997.csv")
# There are 1997 unique IDs, but even this chrom pre-determined algorithm found some off target products
#length(unique(df_in_silico$UNIQID)) # 1997
#length((df_in_silico$UNIQID)) # 2086
#
# one missing in the e object
#setdiff(unique(df_in_silico$UNIQID), filter(e, Library == "ASD1k")$UNIQID) #"1576_19_40388570_G_A_14009.s1" is missing
#filter(df_in_silico, UNIQID == "1576_19_40388570_G_A_14009.s1") # This amplicon is predicted to have a product.
# Below is Linda's code from 20200116_Combined_Analysis.R
# It indicates good reproducibility between the V4 and V5 replicates.
# I modified it adding a filtering step for low represented amplicons with less than 100 counts across DNA samples
## merge data: 2576 x 40
data.merge <- merge(data.v5, data.v4, by = "Amplicon")
data.merge$Undetermined_S0.x <- NULL
data.merge$Undetermined_S0.y <- NULL
# NOTE - add filtering column instead of hard thresholding - consider at later stage in the analysis
# Filtering out amplicons with less than 100 reads at DNA level
# data.merge <- data.merge[rowSums(data.merge[,grepl("AKg",colnames(data.merge))]) > 100,] # down to 1197
# Removing all non ASD1k amplicons which may increase the consistency between the V4 and V5 replicates.
## Add some colors to check for contamination
data.merge <- merge(data.merge, d[,c("UNIQID", "Library")], by.x = "Amplicon", by.y = "UNIQID")
data.merge_ASD1k <- dplyr::filter(data.merge, Library == "ASD1k")
data.merge$Library <- NULL
data.merge_ASD1k$Library <- NULL
# Calculate proportion
amp.prop <- data.merge_ASD1k
rownames(amp.prop) <- amp.prop$Amplicon
amp.prop$Amplicon <- NULL
samples <- colnames(amp.prop)
# Proportion function here: (x+1)/(sum(x)+1) # +1 is padding
## KC: again, the coding practive below is really bad.
amp.prop <- as.data.frame(apply(amp.prop, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))
amp.prop$Amplicon <- rownames(amp.prop)
amp.prop.plot <- amp.prop
#dim(amp.prop.plot)
# GGpairs diagnostic plots
setwd(wd)
# Proportiona DNA
p1 <- ggpairs(amp.prop.plot[, grepl("Kg|Kv", colnames(amp.prop.plot))],
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p1
# Proportion log2 DNA
p2 <- ggpairs(log2(amp.prop.plot[, grepl("Kg|Kv", colnames(amp.prop.plot))]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p2
# GGpairs diagnostic plots
setwd(wd)
# Proportiona DNA
p3 <- ggpairs(amp.prop.plot[, grepl("Ko|Ks", colnames(amp.prop.plot))],
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p3
# GGpairs diagnostic plots
setwd(wd)
# Proportiona DNA
p4 <- ggpairs(log2(amp.prop.plot[, grepl("Ko|Ks", colnames(amp.prop.plot))]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p4
#Let's construct a metadata df
metadata <- data.frame("Sample_name" = c(colnames(data.v4)[2:18], colnames(data.v5)[2:21]))
metadata$Replicate <- ifelse(metadata$Sample_name %in% colnames(data.v4)[2:18], "V4", "V5")
class_of_nucleic_acid <- ifelse(grepl("AKg", metadata$Sample_name), "gDNA", NA)
class_of_nucleic_acid <- ifelse(grepl("AKs|AKo", metadata$Sample_name), "cDNA", class_of_nucleic_acid)
class_of_nucleic_acid <- ifelse(grepl("AKv", metadata$Sample_name), "virus", class_of_nucleic_acid)
metadata$Nucleic_acid <- class_of_nucleic_acid
# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[,
dplyr::filter(metadata, Replicate == "V4",
Nucleic_acid %in% c("gDNA", "virus"))$Sample_name]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
JT: The initial reverse transcription was done with either polyT primer (odt20, “o”) or with the STAR-rev primer (“s”). The purpose being that the S-primer sets should pre-enrich for our transcripts and have higher concentration of amplify-able cDNA for downstream steps.
KC: I’ll sum the reads of the “o” and “s” samples before calculating proportions. The purpose of combining these RNA samples is to represent biological, not technical variance in the Ratiometric activity.
# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[,
dplyr::filter(metadata, Replicate == "V4",
Nucleic_acid %in% c("cDNA"))$Sample_name]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[,
dplyr::filter(metadata, Replicate == "V5",
Nucleic_acid %in% c("gDNA", "virus"))$Sample_name]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[,
dplyr::filter(metadata, Replicate == "V5",
Nucleic_acid %in% c("cDNA"))$Sample_name]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3)))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
library(plotly)
pca_data <- prcomp(amp.prop.plot[,metadata$Sample_name], center = TRUE, scale = TRUE)
pca_data_var_expl <- pca_data$sdev^2 / sum(pca_data$sdev^2)
scree <- data.frame("PC" = paste0(seq(1:37)), "Var_exp" = pca_data_var_expl)
pca_data <- as.data.frame(pca_data$rotation)
pca_data$Sample_name <- rownames(pca_data)
pca_data <- merge(metadata, pca_data, by = "Sample_name")
pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "gDNA", "DNA", pca_data$Nucleic_acid)
pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "cDNA", "RNA", pca_data$Nucleic_acid)
pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "virus", "Virus MaxiPrep", pca_data$Nucleic_acid)
# RT primer type
pca_data_RNA <- dplyr::filter(pca_data, Nucleic_acid == "RNA")
pca_data_RNA$RT_primer <- ifelse(grepl("o", pca_data_RNA$Sample_name), "OdT20", "STAR-rev")
#pca_data_RNA$Sample_name[grepl("o", pca_data_RNA$Sample_name, "Oligo", "STAR")]
pca_data <- merge(pca_data_RNA[, c("Sample_name", "RT_primer")], pca_data, all = TRUE)
pca_data$RT_primer <- ifelse(is.na(pca_data$RT_primer), "NA", pca_data$RT_primer)
#head(pca_data)
ggplot(scree[1:5,], aes(x = PC, y = Var_exp))+
geom_bar(stat = "identity")+
scale_y_continuous(breaks=seq(0, 1, 0.1))+
theme_bw()+
labs(title = "Scree plot", x = "PC", y = "Variance explained")+
theme(plot.title = element_text(hjust = 0.5))
The first 2 PCs account for 95% of variation
p <- ggplot(pca_data, aes(x = PC1, y = PC2, color = Nucleic_acid, shape = Replicate, label = Sample_name))+
geom_point(alpha = 0.7)+
theme_bw()+
labs(title = "PC1 vs PC2")+
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p)
PC2 separates DNA and RNA samples. PC1 separates DNA and RNA samples, with the exception of 3 V4 samples.
p <- ggplot(pca_data, aes(x = PC2, y = PC3, color = Nucleic_acid, shape = Replicate, label = Sample_name))+
geom_point(alpha = 0.7)+
theme_bw()+
labs(title = "PC2 vs PC3")+
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p)
PC3 separates V4 and V5 RNA samples
pca_data <- prcomp(amp.prop.plot[,dplyr::filter(metadata, Nucleic_acid == "cDNA")$Sample_name], center = TRUE, scale = TRUE)
pca_data_var_expl <- pca_data$sdev^2 / sum(pca_data$sdev^2)
scree <- data.frame("PC" = paste0(seq(1:23)), "Var_exp" = pca_data_var_expl)
pca_data <- as.data.frame(pca_data$rotation)
pca_data$Sample_name <- rownames(pca_data)
pca_data <- merge(metadata, pca_data, by = "Sample_name")
pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "cDNA", "RNA", pca_data$Nucleic_acid)
# RT primer type
pca_data_RNA <- dplyr::filter(pca_data, Nucleic_acid == "RNA")
pca_data_RNA$RT_primer <- ifelse(grepl("o", pca_data_RNA$Sample_name), "OdT20", "STAR-rev")
#pca_data_RNA$Sample_name[grepl("o", pca_data_RNA$Sample_name, "Oligo", "STAR")]
pca_data <- merge(pca_data_RNA[, c("Sample_name", "RT_primer")], pca_data, all = TRUE)
pca_data$RT_primer <- ifelse(is.na(pca_data$RT_primer), "NA", pca_data$RT_primer)
pca_data$Biol_rep <- gsub('.*?([0-9]+).*', '\\1', pca_data$Sample_name)
ggplot(scree[1:5,], aes(x = PC, y = Var_exp))+
geom_bar(stat = "identity")+
scale_y_continuous(breaks=seq(0, 1, 0.1))+
theme_bw()+
labs(title = "Scree plot", x = "", y = "Variance explained")+
theme(plot.title = element_text(hjust = 0.5))
The first 2 PCs account for 98% of variation
p <- ggplot(pca_data, aes(x = PC1, y = PC2, color = RT_primer, shape = Replicate, label = Sample_name))+
geom_point(alpha = 0.7)+
theme_bw()+
labs(title = "PC1 vs PC2")+
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p)
PC1 and PC2 separates V4 and V5 samples. RT_primer does not separate samples along PC1 or PC2
library(ggrepel)
p <- ggplot(pca_data, aes(x = PC1, y = PC2, color = Biol_rep, shape = Replicate, label = Sample_name))+
geom_point(alpha = 0.7)+
theme_bw()+
labs(title = "PC1 vs PC2")+
theme(plot.title = element_text(hjust = 0.5))+
geom_text_repel()
p
Clustering of color-coded technical replicates, PC1 vs PC2
library(ggrepel)
p <- ggplot(pca_data, aes(x = PC2, y = PC3, color = Biol_rep, shape = Replicate, label = Sample_name))+
geom_point(alpha = 0.7)+
theme_bw()+
labs(title = "PC2 vs PC3")+
theme(plot.title = element_text(hjust = 0.5))+
geom_text_repel()
p
Clustering of color-coded technical replicates, PC2 vs PC3
# Sum counts of technical replicates
df <- data.merge_ASD1k
data.frame("Amplicon" = df$Amplicon,
)
AKg49_S40 V4 gDNA
AKg50_S41 V4 gDNA
AKg51_S42 V4 gDNA
AKg52_S43 V4 gDNA
AKg53_S44 V4 gDNA
rownames(amp.prop) <- amp.prop$Amplicon
amp.prop$Amplicon <- NULL
samples <- colnames(amp.prop)
# Proportion function here: (x+1)/(sum(x)+1) # +1 is padding
## KC: again, the coding practive below is really bad.
amp.prop <- as.data.frame(apply(amp.prop, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))
amp.prop$Amplicon <- rownames(amp.prop)
amp.prop.plot <- amp.prop
## Manually calculate ratiometric activity (RNA/DNA)
amp.act <- data.frame("amp_id" = rownames(amp.prop.plot), "Color" = "ASD1k",
"Ratio.V5.S.54" = 0, "Ratio.V5.S.55" = 0, "Ratio.V5.S.55b" = 0, "Ratio.V5.S.55c" = 0,
"Ratio.V5.S.56" = 0, "Ratio.V5.S.56b" = 0, "Ratio.V5.S.56c" = 0, "Ratio.V5.S.57" = 0,
"Ratio.V5.S.69" = 0, "Ratio.V5.S.70" = 0, "Ratio.V5.S.71" = 0, "Ratio.V5.S.71b" = 0, "Ratio.V5.S.71c" = 0,
"Ratio.V4.O.49" = 0, "Ratio.V4.O.50" = 0, "Ratio.V4.O.51" = 0, "Ratio.V4.O.52" = 0, "Ratio.V4.O.53" = 0,
"Ratio.V4.S.49" = 0, "Ratio.V4.S.50" = 0, "Ratio.V4.S.51" = 0, "Ratio.V4.S.52" = 0, "Ratio.V4.S.53" = 0
)
# Identify 10 amplicons which are inactive.
## manually define ratiometric activity for V5
amp.act$Ratio.V5.S.54 <- (amp.prop.plot$AKs54_S90) / (amp.prop.plot$AKg54_S83)
amp.act$Ratio.V5.S.55 <- (amp.prop.plot$AKs55_S91) / (amp.prop.plot$AKg55_S84)
amp.act$Ratio.V5.S.55b <- (amp.prop.plot$AKs55b_S97) / (amp.prop.plot$AKg55_S84)
amp.act$Ratio.V5.S.55c <- (amp.prop.plot$AKs55c_S100) / (amp.prop.plot$AKg55_S84)
amp.act$Ratio.V5.S.56 <- (amp.prop.plot$AKs56_S92) / (amp.prop.plot$AKg56_S85)
amp.act$Ratio.V5.S.56b <- (amp.prop.plot$AKs56b_S98) / (amp.prop.plot$AKg56_S85)
amp.act$Ratio.V5.S.56c <- (amp.prop.plot$AKs56c_S101) / (amp.prop.plot$AKg56_S85)
amp.act$Ratio.V5.S.57 <- (amp.prop.plot$AKs57_S93) / (amp.prop.plot$AKg57_S86)
amp.act$Ratio.V5.S.69 <- (amp.prop.plot$AKs69_S94) / (amp.prop.plot$AKg69_S87)
amp.act$Ratio.V5.S.70 <- (amp.prop.plot$AKs70_S95) / (amp.prop.plot$AKg70_S88)
amp.act$Ratio.V5.S.71 <- (amp.prop.plot$AKs71_S96) / (amp.prop.plot$AKg71_S89)
amp.act$Ratio.V5.S.71b <- (amp.prop.plot$AKs71b_S99) / (amp.prop.plot$AKg71_S89)
amp.act$Ratio.V5.S.71c <- (amp.prop.plot$AKs71c_S102) / (amp.prop.plot$AKg71_S89)
## manually define ratiometric activity for V4
amp.act$Ratio.V4.O.49 <- (amp.prop.plot$AKo49_S50) / (amp.prop.plot$AKg49_S40)
amp.act$Ratio.V4.O.50 <- (amp.prop.plot$AKo50_S51) / (amp.prop.plot$AKg50_S41)
amp.act$Ratio.V4.O.51 <- (amp.prop.plot$AKo51_S52) / (amp.prop.plot$AKg51_S42)
amp.act$Ratio.V4.O.52 <- (amp.prop.plot$AKo52_S53) / (amp.prop.plot$AKg52_S43)
amp.act$Ratio.V4.O.53 <- (amp.prop.plot$AKo53_S54) / (amp.prop.plot$AKg53_S44)
amp.act$Ratio.V4.S.49 <- (amp.prop.plot$AKs49_S45) / (amp.prop.plot$AKg49_S40)
amp.act$Ratio.V4.S.50 <- (amp.prop.plot$AKs50_S46) / (amp.prop.plot$AKg50_S41)
amp.act$Ratio.V4.S.51 <- (amp.prop.plot$AKs51_S47) / (amp.prop.plot$AKg51_S42)
amp.act$Ratio.V4.S.52 <- (amp.prop.plot$AKs52_S48) / (amp.prop.plot$AKg52_S43)
amp.act$Ratio.V4.S.53 <- (amp.prop.plot$AKs53_S49) / (amp.prop.plot$AKg53_S44)
## Manually calculate ratiometric activity (RNA/DNA)
amp.act <- data.frame("amp_id" = rownames(amp.prop.plot), "Color" = "ASD1k",
"Ratio.V4.O.49" = 0,
"Ratio.V4.O.50" = 0,
"Ratio.V4.O.51" = 0,
"Ratio.V4.O.52" = 0,
"Ratio.V4.O.53" = 0,
"Ratio.V4.S.49" = 0,
"Ratio.V4.S.50" = 0,
"Ratio.V4.S.51" = 0,
"Ratio.V4.S.52" = 0,
"Ratio.V4.S.53" = 0,
"Ratio.V5.S.54" = 0,
"Ratio.V5.S.55" = 0,
"Ratio.V5.S.55b" = 0,
"Ratio.V5.S.55c" = 0,
"Ratio.V5.S.56" = 0,
"Ratio.V5.S.56b" = 0,
"Ratio.V5.S.56c" = 0,
"Ratio.V5.S.57" = 0,
"Ratio.V5.S.69" = 0,
"Ratio.V5.S.70" = 0,
"Ratio.V5.S.71" = 0,
"Ratio.V5.S.71b" = 0,
"Ratio.V5.S.71c" = 0
)
## manually define ratiometric activity for V5
amp.act$Ratio.V5.S.54 <- (amp.prop.plot$AKs54_S90) / (amp.prop.plot$AKg54_S83)
amp.act$Ratio.V5.S.55 <- (amp.prop.plot$AKs55_S91) / (amp.prop.plot$AKg55_S84)
amp.act$Ratio.V5.S.55b <- (amp.prop.plot$AKs55b_S97) / (amp.prop.plot$AKg55_S84)
amp.act$Ratio.V5.S.55c <- (amp.prop.plot$AKs55c_S100) / (amp.prop.plot$AKg55_S84)
amp.act$Ratio.V5.S.56 <- (amp.prop.plot$AKs56_S92) / (amp.prop.plot$AKg56_S85)
amp.act$Ratio.V5.S.56b <- (amp.prop.plot$AKs56b_S98) / (amp.prop.plot$AKg56_S85)
amp.act$Ratio.V5.S.56c <- (amp.prop.plot$AKs56c_S101) / (amp.prop.plot$AKg56_S85)
amp.act$Ratio.V5.S.57 <- (amp.prop.plot$AKs57_S93) / (amp.prop.plot$AKg57_S86)
amp.act$Ratio.V5.S.69 <- (amp.prop.plot$AKs69_S94) / (amp.prop.plot$AKg69_S87)
amp.act$Ratio.V5.S.70 <- (amp.prop.plot$AKs70_S95) / (amp.prop.plot$AKg70_S88)
amp.act$Ratio.V5.S.71 <- (amp.prop.plot$AKs71_S96) / (amp.prop.plot$AKg71_S89)
amp.act$Ratio.V5.S.71b <- (amp.prop.plot$AKs71b_S99) / (amp.prop.plot$AKg71_S89)
amp.act$Ratio.V5.S.71c <- (amp.prop.plot$AKs71c_S102) / (amp.prop.plot$AKg71_S89)
## manually define ratiometric activity for V4
amp.act$Ratio.V4.O.49 <- (amp.prop.plot$AKo49_S50) / (amp.prop.plot$AKg49_S40)
amp.act$Ratio.V4.O.50 <- (amp.prop.plot$AKo50_S51) / (amp.prop.plot$AKg50_S41)
amp.act$Ratio.V4.O.51 <- (amp.prop.plot$AKo51_S52) / (amp.prop.plot$AKg51_S42)
amp.act$Ratio.V4.O.52 <- (amp.prop.plot$AKo52_S53) / (amp.prop.plot$AKg52_S43)
amp.act$Ratio.V4.O.53 <- (amp.prop.plot$AKo53_S54) / (amp.prop.plot$AKg53_S44)
amp.act$Ratio.V4.S.49 <- (amp.prop.plot$AKs49_S45) / (amp.prop.plot$AKg49_S40)
amp.act$Ratio.V4.S.50 <- (amp.prop.plot$AKs50_S46) / (amp.prop.plot$AKg50_S41)
amp.act$Ratio.V4.S.51 <- (amp.prop.plot$AKs51_S47) / (amp.prop.plot$AKg51_S42)
amp.act$Ratio.V4.S.52 <- (amp.prop.plot$AKs52_S48) / (amp.prop.plot$AKg52_S43)
amp.act$Ratio.V4.S.53 <- (amp.prop.plot$AKs53_S49) / (amp.prop.plot$AKg53_S44)
# Activity sample corelation plots
## Linear
p <- ggpairs(log2(amp.act[, grepl("Ratio", colnames(amp.act))]),
lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 1),
combo = "box", discrete = "blank", na = "na"),
diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4),
na = "blankDiag"),
upper = list(continuous = wrap('cor', size = 3))) +
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(size = 10),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
p
#Activity Mean and SD
library(genefilter)
library(reshape2)
library(data.table)
amp.act_summary <- amp.act[,1:2]
amp.act_summary$Mean_V5 <- rowMeans(amp.act[,c(3:15)])
amp.act_summary$Mean_V4 <- rowMeans(amp.act[,c(16:25)])
amp.act_summary$Mean_V4.O <- rowMeans(amp.act[,c(16:20)])
amp.act_summary$Mean_V4.S <- rowMeans(amp.act[,c(21:20)])
amp.act_summary$Mean_All <- rowMeans(amp.act[,c(3:25)])
amp.act_summary$SD_V5 <- rowSds(as.matrix(amp.act[,c(3:15)]))
amp.act_summary$SD_V4 <- rowSds(as.matrix(amp.act[,c(16:25)]))
amp.act_summary$SD_V4.O <- rowSds(as.matrix(amp.act[,c(16:20)]))
amp.act_summary$SD_V4.S <- rowSds(as.matrix(amp.act[,c(21:20)]))
amp.act_summary$SD_All <- rowSds(as.matrix(amp.act[,c(3:25)]))
# Proportions
#
#colnames(amp.prop.plot)
#colnames(data.v4)
#colnames(data.v5)
V4_gDNA_samples <- dplyr::filter(metadata, Replicate == "V4", Nucleic_acid == "gDNA")$Sample_name
V4_cDNA_samples <- dplyr::filter(metadata, Replicate == "V4", Nucleic_acid == "cDNA")$Sample_name
V5_gDNA_samples <- dplyr::filter(metadata, Replicate == "V5", Nucleic_acid == "gDNA")$Sample_name
V5_cDNA_samples <- dplyr::filter(metadata, Replicate == "V5", Nucleic_acid == "cDNA")$Sample_name
gDNA_samples <- dplyr::filter(metadata, Nucleic_acid == "gDNA")$Sample_name
cDNA_samples <- dplyr::filter(metadata, Nucleic_acid == "cDNA")$Sample_name
amp.prop.plot$Color <- "ASD1k"
amp.prop_summary <- amp.prop.plot[,c("Amplicon", "Color")]
rownames(amp.prop_summary) <- NULL
#head(amp.prop_summary)
amp.prop_summary$Mean_gDNA <- rowMeans(amp.prop.plot[,gDNA_samples])
amp.prop_summary$Mean_cDNA <- rowMeans(amp.prop.plot[,cDNA_samples])
amp.prop_summary$Mean_gDNA_V4 <- rowMeans(amp.prop.plot[,V4_gDNA_samples])
amp.prop_summary$Mean_cDNA_V4 <- rowMeans(amp.prop.plot[,V4_cDNA_samples])
amp.prop_summary$Mean_gDNA_V5 <- rowMeans(amp.prop.plot[,V5_gDNA_samples])
amp.prop_summary$Mean_cDNA_V5 <- rowMeans(amp.prop.plot[,V5_cDNA_samples])
amp.prop_summary$SD_gDNA <- rowSds(amp.prop.plot[,gDNA_samples])
amp.prop_summary$SD_cDNA <- as.numeric(rowSds(amp.prop.plot[,cDNA_samples]))
amp.prop_summary$SD_gDNA_V4 <- as.numeric(rowSds(amp.prop.plot[,V4_gDNA_samples]))
amp.prop_summary$SD_cDNA_V4 <- as.numeric(rowSds(amp.prop.plot[,V4_cDNA_samples]))
amp.prop_summary$SD_gDNA_V5 <- as.numeric(rowSds(amp.prop.plot[,V5_gDNA_samples]))
amp.prop_summary$SD_cDNA_V5 <- as.numeric(rowSds(amp.prop.plot[,V5_cDNA_samples]))
# Applying a threshold on proportions
amp.prop_summary$Pass_prop_gDNA_and_cDNA <- ifelse(amp.prop_summary$Mean_gDNA > 2^-15 &
amp.prop_summary$Mean_cDNA > 2^-15, TRUE, FALSE)
## Add Boolean columns denoting min count tests
min_count = 100
min_count_gDNA <- rowSums(data.merge[,gDNA_samples]>min_count) >= ncol(data.merge[,gDNA_samples])
min_count_gDNA_V4 <- rowSums(data.merge[,V4_gDNA_samples]>min_count) >= ncol(data.merge[,V4_gDNA_samples])
min_count_gDNA_V5 <- rowSums(data.merge[,V5_gDNA_samples]>min_count) >= ncol(data.merge[,V5_gDNA_samples])
min_count_cDNA <- rowSums(data.merge[,cDNA_samples]>min_count) >= ncol(data.merge[,cDNA_samples])
min_count_cDNA_V4 <- rowSums(data.merge[,V4_cDNA_samples]>min_count) >= ncol(data.merge[,V4_cDNA_samples])
min_count_cDNA_V5 <- rowSums(data.merge[,V5_cDNA_samples]>min_count) >= ncol(data.merge[,V5_cDNA_samples])
count_Booleans <- data.frame(
"Amplicon" = data.merge$Amplicon,
"min_count_gDNA" = min_count_gDNA,
"min_count_gDNA_V4" = min_count_gDNA_V4,
"min_count_gDNA_V5" = min_count_gDNA_V5,
"min_count_cDNA" = min_count_cDNA,
"min_count_cDNA_V4" = min_count_cDNA_V4,
"min_count_cDNA_V5" = min_count_cDNA_V5
)
# Building a comprehensive data frame
y <- merge(amp.prop_summary, count_Booleans, by = "Amplicon") # 1994 rows, Only ASD1k amplicons
y_ASD1k <- dplyr::filter(y, Color == "ASD1k") # 1994 rows
library(ggplot2)
library(genefilter)
library(ggpmisc)
# A rectangle delinating DNA and RNA proportion thresholds < 2^-15
d=data.frame(x1=c(-Inf, -Inf), x2=c(+Inf, -15), y1=c(-Inf,-15), y2=c(-15, +Inf))
# Passing min gDNA counts on all samples
p1 <- ggplot()+
geom_point(data = y_ASD1k, aes(x = log2(Mean_gDNA),
y = log2(Mean_cDNA),
color = min_count_gDNA), alpha = 0.3)+
theme_bw()+
geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
labs(title = "Amplicons with > 100 counts in all DNA")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
p2 <- ggplot()+
geom_point(data = y_ASD1k, aes(x = log2(Mean_gDNA), y = log2(Mean_cDNA),
color = min_count_gDNA_V4), alpha = 0.3)+
theme_bw()+
geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
labs(title = "Amplicons with > 100 counts in V4 replicate DNA")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
p3 <- ggplot()+
geom_point(data = y_ASD1k, aes(x = log2(Mean_gDNA), y = log2(Mean_cDNA),
color = min_count_gDNA_V5), alpha = 0.3)+
theme_bw()+
geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
labs(title = "Amplicons with > 100 counts in V5 replicate DNA")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
# Passing min gDNA counts on all samples
y$min_count_gDNA_and_cDNA <- y$min_count_cDNA & y$min_count_gDNA
y_ASD1k$min_count_gDNA_and_cDNA <- y_ASD1k$min_count_cDNA & y_ASD1k$min_count_gDNA
p4 <- ggplot()+
geom_point(data = y_ASD1k, aes(x = log2(Mean_gDNA), y = log2(Mean_cDNA),
color = min_count_gDNA_and_cDNA), alpha = 0.3)+
theme_bw()+
geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
labs(title = "Amplicons with > 100 counts in DNA and RNA samples")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
plot_grid(p1, p2, p3, p4,
labels = c("A", "B", "C", "D"),
rel_widths = c(1.2, 1.2),
vjust = 0.9)
### Dataset for lm ###
# Filtering criteria:
# 1. > 100 reads across all gDNA samples, and > 100 across all cDNA samples
# 2. Mean gDNA proportions and mean cDNA proportions > 2^-15
y$lm_dataset <- y$min_count_gDNA_and_cDNA & y$Pass_prop_gDNA_and_cDNA
y_ASD1k$lm_dataset <- y_ASD1k$min_count_gDNA_and_cDNA & y_ASD1k$Pass_prop_gDNA_and_cDNA
p1 <- ggplot()+
geom_point(data = y_ASD1k, aes(x = log2(Mean_gDNA), y = log2(Mean_cDNA), color = lm_dataset), alpha = 0.3)+
theme_bw()+
geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
labs(title = "Dataset cleaning for building the lm model")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
## plotting lm dataset ##
y_ASD1k_lm <- dplyr::filter(y_ASD1k, lm_dataset == TRUE)
formula <- y ~ x
p2 <- ggplot(y_ASD1k_lm, aes(x = log2(Mean_gDNA), y = log2(Mean_cDNA)))+
geom_point(alpha = 0.3)+
theme_bw()+
geom_abline(intercept = 0, color = "red", linetype = 2)+
geom_smooth(formula = formula, method = 'lm', color = "black")+
stat_poly_eq(formula = formula,
aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")),
parse = TRUE)+
labs(title = "Simple lm fit to the cleaned data, n = 748 amplicons")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
plot_grid(p1, p2,
labels = c("A", "B"),
rel_widths = c(1.2, 1.2),
vjust = 0.9)
# save.image("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/ASD1k_lm_10_07_2020.RData")
# load("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/ASD1k_lm_10_07_2020.RData")
# setwd("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/")
# I'm using GC values from in silico PCR prediction, limiting the subset of amplicons to those with only 1 predicted product/
#filter(d2, Perfect_In_silico_specificity == TRUE)
# Merging GC dataset with perfect in silico specificity reduced amplicon number from 748 to 730. Not much loss!
y_ASD1k_lm_GC <- merge(y_ASD1k_lm, dplyr::filter(d2, Perfect_In_silico_specificity == TRUE)[,c(1, 17)], by.x = "Amplicon", by.y = "UNIQID")
# 727 amplicons after removing 3 without isPCR sequence
y_ASD1k_lm_GC <- na.omit(y_ASD1k_lm_GC)
library(ggplot2)
formula <- y ~ x
y_ASD1k_lm_GC$GC_ab_mean <- ifelse(y_ASD1k_lm_GC$GC_content > mean(na.omit(y_ASD1k_lm_GC$GC_content)), TRUE, FALSE)
#y_ASD1k_lm_GC[-c(252, 499, 576),] # Removes rows containing GC values
# na.omit(y_ASD1k_lm_GC) - the same, just simpler
ggplot(na.omit(y_ASD1k_lm_GC),
aes(x = log2(Mean_gDNA), y = log2(Mean_cDNA),
color = GC_content))+
geom_point(alpha = 0.5)+
theme_bw()+
geom_abline(intercept = 0, color = "red", linetype = 2)+
geom_smooth(formula = formula, method = 'lm', color = "black")+
stat_poly_eq(formula = formula,
aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")),
parse = TRUE)+
scale_color_gradient2(midpoint = mean(na.omit(y_ASD1k_lm_GC$GC_content)), low="blue3", mid="white",
high="red3", space ="Lab" )+
facet_wrap(~ GC_ab_mean)+
labs(title = "DNA vs RNA proportions, split at mean GC content")+
theme(plot.title = element_text(hjust = 0.5),
legend.position="bottom")
There seems to be limited if any correlation of GC content and amplicon activity
# data.all$MeanRatio_MoM <- data.all$RNA_prop_mean / data.all$DNA_prop_mean
y_ASD1k_lm_GC$MeanRatio_All <- y_ASD1k_lm_GC$Mean_cDNA / y_ASD1k_lm_GC$Mean_gDNA
y_ASD1k_lm_GC$MeanRatio_V4 <- y_ASD1k_lm_GC$Mean_cDNA_V4/ y_ASD1k_lm_GC$Mean_gDNA_V4
y_ASD1k_lm_GC$MeanRatio_V5 <- y_ASD1k_lm_GC$Mean_cDNA_V5/ y_ASD1k_lm_GC$Mean_gDNA_V5
formula <- y ~ x
GC_cont_plot <- function(x, y){
ggplot(y_ASD1k_lm_GC, aes(x = log2(get(x)), y = GC_content))+
geom_point(alpha = 0.5)+
theme_bw()+
geom_smooth(formula = formula, method = 'lm', color = "black")+
stat_poly_eq(formula = formula,
aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")),
parse = TRUE)+
labs(title = y, x = "log2(mean DNA prop. / mean RNA prop.)")+
theme(plot.title = element_text(hjust = 0.5, size = 14),
text = element_text(size = 12))
}
plot_grid(GC_cont_plot("MeanRatio_All", "V4 + V5 combined"),
GC_cont_plot("MeanRatio_V4", "V4"),
GC_cont_plot("MeanRatio_V5", "V5"),
nrow = 1,
labels = c("A", "B", "C"),
vjust = 0.9)
V4 and V5 replicates demonstrate oposing GC bias, but the correlation magnitude is small
lm_all_GC <- lm(log2(Mean_cDNA) ~ log2(Mean_gDNA) + GC_content, data = y_ASD1k_lm_GC)
lm_V4_GC <- lm(log2(Mean_cDNA_V4) ~ log2(Mean_gDNA_V4) + GC_content, data = y_ASD1k_lm_GC)
lm_V5_GC <- lm(log2(Mean_cDNA_V5) ~ log2(Mean_gDNA_V5) + GC_content, data = y_ASD1k_lm_GC)
lm_all <- lm(log2(Mean_cDNA) ~ log2(Mean_gDNA), data = y_ASD1k_lm_GC)
lm_V4 <- lm(log2(Mean_cDNA_V4) ~ log2(Mean_gDNA_V4), data = y_ASD1k_lm_GC)
lm_V5 <- lm(log2(Mean_cDNA_V5) ~ log2(Mean_gDNA_V5), data = y_ASD1k_lm_GC)
# Add AIC and BIC
lm_all_GC$AIC <- round(AIC(lm_all_GC), 1)
lm_V4_GC$AIC <- round(AIC(lm_V4_GC), 1)
lm_V5_GC$AIC <- round(AIC(lm_V5_GC), 1)
lm_all$AIC <- round(AIC(lm_all), 1)
lm_V4$AIC <- round(AIC(lm_V4), 1)
lm_V5$AIC <- round(AIC(lm_V5), 1)
lm_all_GC$BIC <- round(BIC(lm_all_GC), 1)
lm_V4_GC$BIC <- round(BIC(lm_V4_GC), 1)
lm_V5_GC$BIC <- round(BIC(lm_V5_GC), 1)
lm_all$BIC <- round(BIC(lm_all), 1)
lm_V4$BIC <- round(BIC(lm_V4), 1)
lm_V5$BIC <- round(BIC(lm_V5), 1)
summary(lm_all)
##
## Call:
## lm(formula = log2(Mean_cDNA) ~ log2(Mean_gDNA), data = y_ASD1k_lm_GC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.95899 -0.58712 0.04132 0.65069 2.45535
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.05850 0.34990 -3.025 0.00257 **
## log2(Mean_gDNA) 0.90415 0.03473 26.032 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9388 on 725 degrees of freedom
## Multiple R-squared: 0.4831, Adjusted R-squared: 0.4824
## F-statistic: 677.7 on 1 and 725 DF, p-value: < 2.2e-16
summary(lm_all_GC)
##
## Call:
## lm(formula = log2(Mean_cDNA) ~ log2(Mean_gDNA) + GC_content,
## data = y_ASD1k_lm_GC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.97285 -0.59137 0.05943 0.64091 2.49391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.93808 0.44120 -2.126 0.0338 *
## log2(Mean_gDNA) 0.90734 0.03547 25.579 <2e-16 ***
## GC_content -0.18911 0.42163 -0.449 0.6539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9393 on 724 degrees of freedom
## Multiple R-squared: 0.4833, Adjusted R-squared: 0.4818
## F-statistic: 338.6 on 2 and 724 DF, p-value: < 2.2e-16
summary(lm_V4)
##
## Call:
## lm(formula = log2(Mean_cDNA_V4) ~ log2(Mean_gDNA_V4), data = y_ASD1k_lm_GC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6134 -0.6992 0.0617 0.8037 2.9775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.11298 0.45327 2.455 0.0143 *
## log2(Mean_gDNA_V4) 1.14188 0.04512 25.307 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.152 on 725 degrees of freedom
## Multiple R-squared: 0.469, Adjusted R-squared: 0.4683
## F-statistic: 640.4 on 1 and 725 DF, p-value: < 2.2e-16
summary(lm_V4_GC)
##
## Call:
## lm(formula = log2(Mean_cDNA_V4) ~ log2(Mean_gDNA_V4) + GC_content,
## data = y_ASD1k_lm_GC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6857 -0.7306 0.0315 0.7504 2.6988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.87641 0.54531 -1.607 0.108
## log2(Mean_gDNA_V4) 1.08936 0.04479 24.321 < 2e-16 ***
## GC_content 3.13066 0.50282 6.226 8.09e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.123 on 724 degrees of freedom
## Multiple R-squared: 0.496, Adjusted R-squared: 0.4946
## F-statistic: 356.3 on 2 and 724 DF, p-value: < 2.2e-16
summary(lm_V5)
##
## Call:
## lm(formula = log2(Mean_cDNA_V5) ~ log2(Mean_gDNA_V5), data = y_ASD1k_lm_GC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3487 -0.6218 0.0221 0.6649 2.4482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.65392 0.33981 -7.81 2.01e-14 ***
## log2(Mean_gDNA_V5) 0.73914 0.03362 21.98 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9635 on 725 degrees of freedom
## Multiple R-squared: 0.4, Adjusted R-squared: 0.3992
## F-statistic: 483.3 on 1 and 725 DF, p-value: < 2.2e-16
summary(lm_V5_GC)
##
## Call:
## lm(formula = log2(Mean_cDNA_V5) ~ log2(Mean_gDNA_V5) + GC_content,
## data = y_ASD1k_lm_GC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4787 -0.6280 0.0354 0.6960 2.4584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.34304 0.43017 -3.122 0.00187 **
## log2(Mean_gDNA_V5) 0.77329 0.03385 22.844 < 2e-16 ***
## GC_content -2.06901 0.42660 -4.850 1.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9488 on 724 degrees of freedom
## Multiple R-squared: 0.4189, Adjusted R-squared: 0.4173
## F-statistic: 260.9 on 2 and 724 DF, p-value: < 2.2e-16
library(stargazer)
stargazer(lm_all, lm_all_GC, lm_V4, lm_V4_GC, lm_V5, lm_V5_GC,
title="Regression Results",
single.row = TRUE,
font.size = "huge",
dep.var.labels.include = TRUE,
column.labels=c("V4 + V5", "V4 + V5 w GC", "V4", "V4 w GC", "V5", "V5 w GC"),
align=TRUE,
type = "html",
keep.stat=c("aic", "bic", "rsq", "n"),
report=('vc*p'))
| Dependent variable: | ||||||
| log2(Mean_cDNA) | log2(Mean_cDNA_V4) | log2(Mean_cDNA_V5) | ||||
| V4 + V5 | V4 + V5 w GC | V4 | V4 w GC | V5 | V5 w GC | |
| (1) | (2) | (3) | (4) | (5) | (6) | |
| log2(Mean_gDNA) | 0.904*** | 0.907*** | ||||
| p = 0.000 | p = 0.000 | |||||
| GC_content | -0.189 | 3.131*** | -2.069*** | |||
| p = 0.654 | p = 0.000 | p = 0.00001 | ||||
| log2(Mean_gDNA_V4) | 1.142*** | 1.089*** | ||||
| p = 0.000 | p = 0.000 | |||||
| log2(Mean_gDNA_V5) | 0.739*** | 0.773*** | ||||
| p = 0.000 | p = 0.000 | |||||
| Constant | -1.059*** | -0.938** | 1.113** | -0.876 | -2.654*** | -1.343*** |
| p = 0.003 | p = 0.034 | p = 0.015 | p = 0.109 | p = 0.000 | p = 0.002 | |
| Observations | 727 | 727 | 727 | 727 | 727 | 727 |
| R2 | 0.483 | 0.483 | 0.469 | 0.496 | 0.400 | 0.419 |
| Akaike Inf. Crit. | 1,975.300 | 1,977.100 | 2,272.600 | 2,236.700 | 2,013.000 | 1,991.800 |
| Bayesian Inf. Crit. | 1,989.000 | 1,995.400 | 2,286.400 | 2,255.100 | 2,026.800 | 2,010.100 |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
y_ASD1k_lm_GC <- na.omit(y_ASD1k_lm_GC)
p1 <- ggplot(y_ASD1k_lm_GC, aes(x = log2(MeanRatio_V4), y = log2(MeanRatio_V5)))+
geom_point(alpha = 0.5)+
theme_bw()+
geom_hline(yintercept = 0)+
geom_vline(xintercept = 0)+
labs(title = "V4 vs V5 ratiometric activity")+
geom_smooth(formula = formula, method = 'lm', color = "black")+
stat_poly_eq(formula = formula,
aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")),
parse = TRUE)+
theme(plot.title = element_text(hjust = 0.5, size = 14),
text = element_text(size = 12))
y_ASD1k_lm_GC$V4_res <- lm_V4$residuals
y_ASD1k_lm_GC$V5_res <- lm_V5$residuals
y_ASD1k_lm_GC$V4_res_GC <- lm_V4_GC$residuals
y_ASD1k_lm_GC$V5_res_GC <- lm_V5_GC$residuals
p2 <- ggplot(y_ASD1k_lm_GC, aes(x = V4_res, y = V5_res))+
geom_point(alpha = 0.5)+
theme_bw()+
geom_hline(yintercept = 0)+
geom_vline(xintercept = 0)+
labs(title = "V4 vs V5 residuals in models without GC")+
geom_smooth(formula = formula, method = 'lm', color = "black")+
stat_poly_eq(formula = formula,
aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")),
parse = TRUE)+
theme(plot.title = element_text(hjust = 0.5, size = 14),
text = element_text(size = 12))
p3 <- ggplot(y_ASD1k_lm_GC, aes(x = V4_res_GC, y = V5_res_GC))+
geom_point(alpha = 0.5)+
theme_bw()+
geom_hline(yintercept = 0)+
geom_vline(xintercept = 0)+
labs(title = "V4 vs V5 residuals in models with GC")+
geom_smooth(formula = formula, method = 'lm', color = "black")+
stat_poly_eq(formula = formula,
aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")),
parse = TRUE)+
theme(plot.title = element_text(hjust = 0.5, size = 14),
text = element_text(size = 12))
plot_grid(p1, p2, p3,
nrow = 1,
labels = c("A", "B", "C"),
vjust = 0.9)